xxxxxxxxxx# Table of content<ul>Data Wrangling<li><a href="#preprocessing">Preprocessing Text</a></li>Data Viz<li><a href="#word_freq">Word Frequency</a></li> <li><a href="#freqword">Word Occurence Time Series </a></li><li><a href="#topic">Topic Modelling</a></li> <li><a href="#coherent_model">Topic Modelling Coherent </a></li> <li><a href="#cluster_topic">Topic Clustering of documents</a></li><li><a href="#similar_topic">Similarity Topic Finder</a></li><li><a href="#dominant_topic">Dominant Topic in documents</a></li><li><a href="#ner"> Named Entity Recognition</a></li></ul># pip install pandas matplotlib bs4 wordcloud seaborn spacy nltlk install gensim pyLDAvis numpy re string import pandas as pdimport numpy as npimport reimport stringimport unicodedata"""data visualization"""import matplotlib.pyplot as pltfrom os import pathfrom PIL import Imagefrom wordcloud import WordCloud, STOPWORDS, ImageColorGeneratorimport plotly.express as pximport seaborn as snsfrom matplotlib import rc"""to supress warning"""import warningswarnings.filterwarnings("ignore",category=DeprecationWarning)"""text cleaning"""from bs4 import BeautifulSoup"""stopwords, tokenizer, stemmer"""import spacyimport nltk from nltk.corpus import stopwordsfrom nltk.tokenize import word_tokenizefrom nltk.stem import PorterStemmerfrom nltk.probability import FreqDistfrom nltk.sentiment.vader import SentimentIntensityAnalyzerfrom nltk.stem import WordNetLemmatizer from nltk.corpus import wordnet#nltk.download('punkt')#nltk.download('wordnet')"""additional processing"""import gensimimport gensim.corpora as corporafrom gensim.parsing.preprocessing import STOPWORDSfrom gensim.parsing.preprocessing import remove_stopwordsfrom gensim.corpora import Dictionary from gensim.models.ldamodel import LdaModelfrom gensim.models import CoherenceModelfrom gensim.utils import simple_preprocess from collections import Counter"""ML modelling"""# Enable logging for gensim - optional#import 
# logging  (continuation of the commented-out gensim logging setup above)
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
# conda install -c conda-forge pyldavis
from sklearn.decomposition import LatentDirichletAllocation
# pip install flair
import pyLDAvis
from matplotlib import gridspec
import math
# pip install pyLDAvis.sklearn
import pyLDAvis.sklearn
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
# nltk.download('averaged_perceptron_tagger')
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

# Load the scraped forum data and drop exact duplicate rows.
global_df = pd.read_csv('global_df.csv')
global_df.head(5)
global_df.shape
global_df = global_df.iloc[global_df.astype(str).drop_duplicates().index]
global_df.shape

# getting rows with topic/post content (the 'cooked' column holds the HTML body)
global_df = global_df[global_df.cooked != '0']
global_df.shape

# ---------------------------------------------------------------------------
# 1 - Preprocessing Topic/Post Text
#     * Data wrangling * Text cleaning * Lemmatization * Tokenization
# ---------------------------------------------------------------------------
global_df['creation_date'] = pd.to_datetime(global_df['created_at'],
                                            format='%Y-%m-%d %H:%M:%S.%f')
global_df['creation_year'] = global_df['creation_date'].dt.year


def remove_html(text):
    """Strip HTML tags such as <p> from *text*, replacing each tag with a space."""
    pattern = re.compile(r'<[^<]+?>')
    text = re.sub(pattern, ' ', text)
    return text


global_df['noHTML_text'] = global_df.cooked.apply(lambda x: remove_html(x))
global_df.to_csv('global_df.csv', header=True, encoding='utf-8', index=False)


def clean_text(text):
    """Normalise a raw post body into lower-case, ASCII-only, space-separated words.

    Steps: strip HTML tags, drop hyperlinks, lower-case, drop apostrophe
    suffixes ("'s" etc.), drop numbers, transliterate accents, and replace
    any remaining non-alphanumeric runs with a single space.
    """
    # Remove html code <p>
    pattern = re.compile(r'<[^<]+?>')
    text = re.sub(pattern, ' ', text)
    # Remove hyperlinks
    text = re.sub(r'https?:\/\/\S+', '', text)
    # lowercase
    text = text.lower()
    # Remove ticks, apostrophes and letters after " 's "
    text = re.sub(r"\’\w+", '', text)
    text = re.sub(r"\'\w+", '', text)
    # Remove all types of nums
    text = re.sub(r'\b\d+(?:\.\d+)?\s+', '', text)
    text = re.sub(r"\b\d+\b", ' ', text)
    # BUGFIX: transliterate accents ("é" -> "e") BEFORE the [^A-Za-z0-9]
    # filter below. The original ran this normalisation as the LAST step,
    # by which point every accented character had already been deleted by
    # the filter, making it a no-op.
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    # Remove remaining punctuation
    text = re.sub('[^A-Za-z0-9]+', ' ', text)
    # remove the hashtag symbol
    # text = re.sub(r'#','',text)
    # text = re.sub(r'[-./]', ' ', text)
    return text
# Build the cleaned-text column and eyeball it against the raw markup.
global_df['cleaned_text'] = global_df.cooked.apply(clean_text)

# Compare the original text with the cleaned text side by side.
pd.options.display.max_colwidth = 300
global_df[['cooked', 'cleaned_text']].head(1)

# to avoid an OS error run the following command first
# !python3 -m spacy download en_core_web_sm
nltk.download('stopwords')
# ---------------------------------------------------------------------------
# Holistic stop-word list: union of the spaCy, gensim and NLTK stop words
# plus a hand-curated, corpus-specific extra set.
# ---------------------------------------------------------------------------
nlp = spacy.load('en_core_web_sm')
set1 = list(nlp.Defaults.stop_words)
set2 = list(STOPWORDS)
set3 = list(stopwords.words('english'))
extra_set = ['ill', 'hi', 'im', 'ive', 'dont', 'hello', 'hey', 'like', 'thanks',
             'maybe', 'q', 'e g', 'think', 'good', 'com', 'hop', 'thats', 'c',
             'b', 'l', 'il', 'x', 'z', 'v', 'f', 'e', 'q', 'isnt', 'wont',
             'yes', 'want', 'let', 'know', 'id', 'g', 'thing', 'thank', 'come',
             'mm', 'w', 'non', 'day', 'look', 'baby', 'n', 'lot', 'way', 'use',
             'try', 'hour', 'couple', 'week', 'ago', 'i', 'h', 'et', 'img',
             'cool', 'year', 'u', 'need', 'nice', 'guy', 'boy', 'pm', '3rd',
             'pretty', 'sure', 'bit', 'week', 'minute', 'png', 'screen', 'shot',
             'sans', 'serif', 'mon', 'mar', 'kb', 'gmt', 'arial', 'helvetica',
             'feb', 'long', 'period', 'time', 'font', 'size', 'medium',
             'interested', 'dat', 'drank', 'dad', 'bottle', 'turn', 'number',
             'everyday', 'follow', 'life', 'jar', 'j', 'www', 'happen', 'yeah',
             'wish', 'love', 'cheer', 'fun', 'later', 'bad', 'unfortunately',
             'org', 'july', 'ki', 'february', 'month', 'saw', 'potentially',
             '21st', 'hf', 'rr', 'lf', 'wait', 'yay', 'especially', 'feel', 'go']
# PERF FIX: the original kept this as a list (~1000 entries), making every
# `word in stop_words_list` test O(n) for every token of every document.
# A set gives O(1) membership with identical filtering results.
stop_words_list = set(set1) | set(set2) | set(set3) | set(extra_set)


def remove_stopwords(text):
    """Return *text* with every stop word removed (whitespace tokenisation).

    NOTE(review): this deliberately shadows the gensim
    `remove_stopwords` imported at the top of the file — all later calls in
    this notebook resolve to this version.
    """
    slist = [word for word in text.split() if word not in stop_words_list]
    text = (" ").join(slist)
    return text


global_df['cleaned_text'] = global_df.cleaned_text.apply(lambda x: remove_stopwords(x))

# Lemmatize words to their root form.
lemmatizer = WordNetLemmatizer()


def nltk_tag_to_wordnet_tag(nltk_tag):
    """Map a Penn-Treebank POS tag prefix to the corresponding WordNet tag.

    Returns None for tags WordNet has no category for.
    """
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None


def lemmatize_sentence(sentence):
    """POS-tag *sentence* and lemmatise each token with its WordNet tag."""
    # tokenize the sentence and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
    # tuples of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            # no usable tag: keep the token as is
            lemmatized_sentence.append(word)
        else:
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)


# to avoid errors with the nltk library download the following
nltk.download('punkt')
nltk.download('wordnet')
global_df['lemmat_text'] = global_df.cleaned_text.apply(lambda x: lemmatize_sentence(x))
global_df['lemmat_text'] = global_df.lemmat_text.apply(lambda x: remove_stopwords(x))

# Lemmatize words to their root form, keeping only NOUN ('NN*') POS tags
# — this variant feeds the LDA topic models.
lemmatizer = WordNetLemmatizer()


def nltk_tag_to_n_tag(nltk_tag):
    """Return wordnet.NOUN for noun POS tags, None for everything else."""
    if nltk_tag.startswith('N'):
        return wordnet.NOUN
    else:
        return None


def lemma_sentence(sentence):
    """Lemmatise *sentence*, keeping only the tokens tagged as nouns."""
    # tokenize the sentence and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
    # tuples of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_n_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            pass  # non-noun tokens are dropped
        else:
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)


global_df['lemma_LDA_text'] = global_df.cleaned_text.apply(lambda x: lemma_sentence(x))
global_df['lemma_LDA_text'] = global_df.lemma_LDA_text.apply(lambda x: remove_stopwords(x))


def tokenize_text(text):
    """Split *text* into word tokens (thin wrapper around nltk.word_tokenize).

    SIMPLIFIED: word_tokenize already returns the very list the original
    rebuilt element by element.
    """
    return word_tokenize(text)


# tokenize all-POS text and NOUN-only text
global_df['token_text'] = global_df.lemmat_text.apply(lambda x: tokenize_text(x))
global_df['token_NN_text'] = global_df.lemma_LDA_text.apply(lambda x: tokenize_text(x))
global_df[['lemmat_text', 'lemma_LDA_text']].head()
global_df.to_csv('global_df.csv', header=True, encoding='utf-8', index=False)
global_df.columns

# BUGFIX: the original loop variable here was named `string`, shadowing the
# stdlib `string` module imported at the top of the file.
mylist = global_df['lemmat_text'].tolist()
mylist
new_text = str(mylist)
# tokenization
txt_tokens = word_tokenize(new_text)
words = [word.lower() for word in txt_tokens if word.isalpha()]

# Frequency distribution of the whole vocabulary
fdist = FreqDist(words)
df_fdist = pd.DataFrame(fdist.items(), columns=['word', 'frequency'])
# top 30 words
df_fdist30 = df_fdist.sort_values(['frequency'], ascending=False).head(30)
# plot top 30 words.  FIX: sns.catplot is a figure-level function that
# creates its own figure; the original plt.figure(figsize=(20,10)) before it
# only emitted a spurious blank figure.
sns.catplot(y="word", x="frequency", kind="bar", palette="vlag",
            data=df_fdist30, height=8.27, aspect=11.7 / 8.27)
plt.title('TOP 30 frequent words which occurred in the posts/topics')

# posts created in 2020 or 2021 only
df_21 = global_df[(global_df['creation_year'] == 2021) | (global_df['creation_year'] == 2020)]
# Word frequencies restricted to the 2020/2021 posts.
# (BUGFIX: loop variable no longer shadows the stdlib `string` module.)
mylist = df_21['cleaned_text'].tolist()
mylist
new_text = str(mylist)
# tokenization
txt_tokens = word_tokenize(new_text)
words_21 = [word.lower() for word in txt_tokens if word.isalpha()]
# Frequency Distribution
fdist = FreqDist(words_21)
df_fdist = pd.DataFrame(fdist.items(), columns=['word', 'frequency'])
# top 30 words
df_fdist30 = df_fdist.sort_values(['frequency'], ascending=False).head(30)
df_fdist30

# Word cloud over the full-corpus vocabulary (`words`).
listToStr = ' '.join(map(str, words))
wordcloud = WordCloud(width=1600, height=800, max_font_size=200, max_words=100,
                      colormap='vlag', background_color="white",
                      collocations=True).generate(listToStr)
plt.figure(figsize=(20, 10))
plt.imshow(wordcloud)
plt.title('Wordcloud of TOP 100 frequent words for all posts/topics')
plt.axis("off")
plt.show()

# Word cloud drawn inside the QS logo mask.
QS_word = ' '.join(map(str, words))
# open the image and use np.array to transform the file to an array
cand_mask = np.array(Image.open('QS.png'))
# values greater than 3 become 255 (white); smaller values keep whatever
# value they have in the array
cand_mask = np.where(cand_mask > 3, 255, cand_mask)
wordcloud = WordCloud(background_color='white', contour_color='blue',
                      mask=cand_mask, contour_width=2).generate(QS_word)
plt.figure(figsize=(20, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title('Wordcloud of TOP frequent words for Users posts/topics')
plt.axis('off')
plt.show()

# ---------------------------------------------------------------------------
# 4 - Bigrams: most frequent n-grams occurring in the posts/topics
# (the blank plt.figure calls before sns.catplot were removed — catplot
# creates its own figure)
# ---------------------------------------------------------------------------
# top 20 bigrams
bigrams_series = (pd.Series(nltk.ngrams(words, 2)).value_counts())[:20]
bigrams_top = pd.DataFrame(bigrams_series.sort_values(ascending=False))
bigrams_top = bigrams_top.reset_index().rename(columns={'index': 'bigrams', 0: 'counts'})
bigrams_top
sns.catplot(x='counts', y='bigrams', kind="bar", palette="vlag",
            data=bigrams_top, height=8.27, aspect=11.7 / 8.27)
plt.title('TOP 20 pair words which occurred in the topics/posts')

# top 20 trigrams
trigrams_series = (pd.Series(nltk.ngrams(words, 3)).value_counts())[:20]
trigrams_top = pd.DataFrame(trigrams_series.sort_values(ascending=False))
trigrams_top = trigrams_top.reset_index().rename(columns={'index': 'trigrams', 0: 'counts'})
trigrams_top
sns.catplot(x='counts', y='trigrams', kind="bar", palette="vlag",
            data=trigrams_top, height=8.27, aspect=11.7 / 8.27)
plt.title('TOP 20 three words which occurred together in the topics/posts')

# top 20 fourgrams
fourgrams_series = (pd.Series(nltk.ngrams(words, 4)).value_counts())[:20]
fourgrams_top = pd.DataFrame(fourgrams_series.sort_values(ascending=False))
fourgrams_top = fourgrams_top.reset_index().rename(columns={'index': 'fourgrams', 0: 'counts'})
fourgrams_top
sns.catplot(x='counts', y='fourgrams', kind="bar", palette="vlag",
            data=fourgrams_top, height=8.27, aspect=11.7 / 8.27)
# BUGFIX: the title said "three words" — copy-pasted from the trigram plot.
plt.title('TOP 20 four words which occurred together in the topics/posts')

# ---------------------------------------------------------------------------
# 4.1 Keyword Analysis: Lexical Dispersion Plot
# ---------------------------------------------------------------------------
test = global_df.copy()
# Count the occurrences of the most frequent keywords in each thread.
# NOTE(review): the 'no_sw_text' column is not created anywhere in this
# file — it presumably comes from an earlier pipeline version; verify.
keywords = ['data', 'track', 'sleep', 'work', 'app', 'device', 'self',
            'people', 'help', 'test', 'start', 'health', 'measure', 'zeo',
            'post', 'new', 'question', 'rate', 'user', 'heart', 'change',
            'project', 'sensor', 'tool', 'different', 'activity', 'idea',
            'apps', 'record', 'research']
# REFACTOR: one loop replaces 30 copy-pasted `test[...] = ...str.count(...)`
# assignments; the column order is preserved.
for kw in keywords:
    test[kw] = test['no_sw_text'].str.count(kw)
# BUGFIX: pass the column selection as a *list* — the original
# `.groupby(...)['data', 'track', ...]` passes a tuple, which is deprecated
# and removed in modern pandas.
words_df = test.groupby('creation_year')[keywords].sum()
words_df
words_df.columns[0:30]
# <a id='freqword'></a>
# ## Most frequent words in QS over the years of 2011 to 2021
import plotly.graph_objs as go

fig = px.line(words_df, x=words_df.index, y=words_df.columns[0:30])
fig.show()

df = global_df.copy()
df['no_sw_LDA_text'] = df['no_sw_LDA_text'].astype('str')

# Merge the different threads of each topic_id into a single document,
# giving 2117 documents in total.
df = df.groupby(['topic_id'], as_index=False).agg({'no_sw_LDA_text': ' '.join})
df.head()
df['token_NN_text'] = df.no_sw_LDA_text.apply(lambda x: tokenize_text(x))

# Gensim dictionary: drop words present in fewer than 15 documents or in
# more than half of them, and keep at most the 100k most frequent words.
dictionary = corpora.Dictionary(df.token_NN_text)
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
# bag-of-words ('bow') representation of every document
corpus = [dictionary.doc2bow(doc) for doc in df.token_NN_text]
len(corpus)
len(dictionary)
dictionary

# %%time
# Build a 30-topic gensim LDA model and persist it to disk.
LDA = gensim.models.ldamodel.LdaModel
lda_model = LDA(corpus=corpus,
                id2word=dictionary,
                num_topics=30,
                random_state=50,
                update_every=1,
                chunksize=100,
                passes=10,
                # alpha = "auto"
                )
lda_model.save('QS_topics_lda.model')
# num_topics=10, random_state=100, chunksize=1000, passes=50, iterations=100)

from gensim import corpora, models

# TF-IDF weighted corpus, then two multicore LDA variants for comparison.
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

lda_model = gensim.models.LdaMulticore(corpus, num_topics=10,
                                       id2word=dictionary, passes=2, workers=2)
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10,
                                             id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

# Topic scores for one sample document under each of the two models,
# highest probability first.
for index, score in sorted(lda_model[corpus[1]], key=lambda item: -item[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))
for index, score in sorted(lda_model_tfidf[corpus[1]], key=lambda item: -item[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))

# Classify an unseen document with the bag-of-words LDA model.
unseen_document = 'How a Pentagon deal became an identity crisis for Google'
unseen_document = lemma_sentence(unseen_document)
unseen_document = tokenize_text(unseen_document)
unseen_document
lda_model
bow_vector = dictionary.doc2bow(unseen_document)
for index, score in sorted(lda_model[bow_vector], key=lambda item: -item[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Interactive visualisation of the gensim LDA model.
vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis_data)
pyLDAvis.save_html(vis_data, 'ldamodel.html')

# sklearn pipeline: vectorise the merged documents.
vectorizer = CountVectorizer(analyzer='word',
                             min_df=10,                        # minimum reqd occurences of a word
                             stop_words='english',             # remove stop words
                             lowercase=True,                   # convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{3,}',  # num chars > 3
                             # max_features=50000,             # max number of uniq words
                             )
data_vectorized = vectorizer.fit_transform(df.no_sw_LDA_text)

# Compute Sparsicity = percentage of non-zero cells.
# PERF: read nnz straight off the sparse matrix — the original materialised
# a full dense copy (todense()) just for this one statistic; CountVectorizer
# stores no explicit zeros, so the value is identical.
print("Sparsicity: ",
      (data_vectorized.nnz / (data_vectorized.shape[0] * data_vectorized.shape[1])) * 100, "%")

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV

# Build LDA Model
lda_model = LatentDirichletAllocation(n_components=20,
                                      max_iter=10,           # max learning iterations
                                      learning_method='online',
                                      random_state=100,      # random state
                                      batch_size=128,        # n docs in each learning iter
                                      evaluate_every=-1,     # compute perplexity every n iters, default: don't
                                      n_jobs=-1,             # use all available CPUs
                                      )
lda_output = lda_model.fit_transform(data_vectorized)
print(lda_model)  # model attributes

from pprint import pprint

# Grid-search over topic count and learning decay.
# CLEANUP: removed the duplicate `GridSearchCV` import and the stale
# commented-out snippet that relied on the long-removed
# `grid_scores_` / `mean_validation_score` API.
search_params = {'n_components': [10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9]}
lda = LatentDirichletAllocation()
model = GridSearchCV(lda, param_grid=search_params)
model.fit(data_vectorized)

best_lda_model = model.best_estimator_
print("Best Model's Params: ", model.best_params_)
print("Best Log Likelihood Score: ", model.best_score_)
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

# Get the log likelihoods per learning_decay from the grid-search output.
n_topics = [10, 15, 20, 25, 30]
log_likelyhoods_5 = [round(model.cv_results_['mean_test_score'][index])
                     for index, gscore in enumerate(model.cv_results_['params'])
                     if gscore['learning_decay'] == 0.5]
log_likelyhoods_7 = [round(model.cv_results_['mean_test_score'][index])
                     for index, gscore in enumerate(model.cv_results_['params'])
                     if gscore['learning_decay'] == 0.7]
log_likelyhoods_9 = [round(model.cv_results_['mean_test_score'][index])
                     for index, gscore in enumerate(model.cv_results_['params'])
                     if gscore['learning_decay'] == 0.9]

# Show graph
plt.figure(figsize=(12, 8))
plt.plot(n_topics, log_likelyhoods_5, label='0.5')
plt.plot(n_topics, log_likelyhoods_7, label='0.7')
plt.plot(n_topics, log_likelyhoods_9, label='0.9')
plt.title("Choosing Optimal LDA Model")
plt.xlabel("Num Topics")
plt.ylabel("Log Likelyhood Scores")
plt.legend(title='Learning decay', loc='best')
plt.show()
df.columns
# Document x topic matrix from the tuned sklearn LDA model.
lda_output = best_lda_model.transform(data_vectorized)
# column and index labels
topicnames = [f"Topic{i}" for i in range(best_lda_model.n_components)]
docnames = [f"Doc{i}" for i in range(len(df.no_sw_LDA_text))]
df_document_topic = pd.DataFrame(np.round(lda_output, 2),
                                 columns=topicnames, index=docnames)
# dominant topic of a document = argmax over its topic columns
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic


def color_green(val):
    """Styler helper: colour a cell green when its topic weight exceeds 0.1."""
    return 'color: green' if val > .1 else 'color: black'


def make_bold(val):
    """Styler helper: embolden a cell when its topic weight exceeds 0.1."""
    return 'font-weight: 700' if val > .1 else 'font-weight: 400'


# styled preview of the first 15 documents
df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
df_document_topics

# how many documents fall under each dominant topic
df_topic_distribution = df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents")
df_topic_distribution.columns = ['Topic Num', 'Num Documents']
df_topic_distribution.sort_values(by='Topic Num')
pyLDAvis.enable_notebook()
# NOTE(review): the pyLDAvis.sklearn module was renamed/removed in
# pyLDAvis >= 3.4 — kept as-is to match the version this notebook ran with.
plot = pyLDAvis.sklearn.prepare(best_lda_model, data_vectorized, vectorizer, mds='tsne')
plot

# Topic x keyword weight matrix.
df_topic_keywords = pd.DataFrame(best_lda_model.components_)
# assign column and index labels
df_topic_keywords.columns = vectorizer.get_feature_names()
df_topic_keywords.index = topicnames
df_topic_keywords.head()


def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):
    """Return, for each topic of *lda_model*, its *n_words* strongest keywords.

    NOTE: the defaults bind the module-level vectorizer/lda_model at
    definition time; callers below pass best_lda_model explicitly.
    """
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords


topic_keywords = show_topics(vectorizer=vectorizer, lda_model=best_lda_model, n_words=15)

# Topic - Keywords Dataframe
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word ' + str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic ' + str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords


def sent_to_words(sentences):
    """Yield each sentence as a list of simple_preprocess tokens.

    deacc=True removes punctuations.
    """
    for sentence in sentences:
        yield (gensim.utils.simple_preprocess(str(sentence), deacc=True))


def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatise token lists with spaCy, keeping only *allowed_postags*.

    https://spacy.io/api/annotation
    """
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else ''
                                   for token in doc if token.pos_ in allowed_postags]))
    return texts_out


# Define function to predict topic for a given text document.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])


def predict_topic(text, nlp=nlp):
    """Predict the dominant topic of *text* with the tuned sklearn LDA model.

    Returns (keyword list of the winning topic, raw topic probability scores).

    CLEANUP: dropped the original `global sent_to_words` / `global
    lemmatization` statements — reading module-level names needs no
    `global` declaration.
    """
    # Step 1: clean with simple_preprocess
    mytext_2 = list(sent_to_words(text))
    # Step 2: lemmatize
    mytext_3 = lemmatization(mytext_2, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
    # Step 3: vectorize transform
    mytext_4 = vectorizer.transform(mytext_3)
    # Step 4: LDA transform
    topic_probability_scores = best_lda_model.transform(mytext_4)
    topic = df_topic_keywords.iloc[np.argmax(topic_probability_scores), :].values.tolist()
    return topic, topic_probability_scores


# Predict the topic of a sample text.
mytext = ["sleep"]
topic, prob_scores = predict_topic(text=mytext)
print(topic)
# <a id='cluster_topic'></a>
# ## Cluster documents with similar topics
# Construct the k-means clusters over the document-topic matrix.
from sklearn.cluster import KMeans

clusters = KMeans(n_clusters=15, random_state=100).fit_predict(lda_output)

# Build the Singular Value Decomposition (SVD) model — 2 components for plotting.
svd_model = TruncatedSVD(n_components=2)
lda_output_svd = svd_model.fit_transform(lda_output)

# X and Y axes of the plot using the SVD decomposition
x = lda_output_svd[:, 0]
y = lda_output_svd[:, 1]

# Weights of the lda_output columns for each component
print("Component's weights: \n", np.round(svd_model.components_, 2))
# Percentage of total information in 'lda_output' explained by the two components
print("Perc of Variance Explained: \n", np.round(svd_model.explained_variance_ratio_, 2))

# Plot
plt.figure(figsize=(12, 12))
plt.scatter(x, y, c=clusters)
# BUGFIX: the axis labels were swapped — x is the FIRST SVD component,
# y the SECOND.
plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.title("Segregation of Topic Clusters", )
# <a id='similar_topic'></a>
# ## Similar documents by entering text + create dashboard
global_df.columns
from sklearn.metrics.pairwise import euclidean_distances

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])


def similar_documents(text, doc_topic_probs, documents=global_df.cleaned_text,
                      nlp=nlp, top_n=5, verbose=False):
    """Return ids and texts of the *top_n* documents closest to *text* in topic space.

    Closeness = euclidean distance between the LDA topic-probability vector
    of *text* (via predict_topic) and each row of *doc_topic_probs*.
    """
    topic, x = predict_topic(text)
    dists = euclidean_distances(x.reshape(1, -1), doc_topic_probs)[0]
    doc_ids = np.argsort(dists)[:top_n]
    if verbose:
        print("Topic KeyWords: ", topic)
        print("Topic Prob Scores of text: ", np.round(x, 1))
        print("Most Similar Doc's Probs: ", np.round(doc_topic_probs[doc_ids], 1))
    return doc_ids, np.take(documents, doc_ids)


# Get similar documents for a sample query.
mytext = ["Some text about science and health"]
doc_ids, docs = similar_documents(text=mytext, doc_topic_probs=lda_output,
                                  documents=global_df.cleaned_text, top_n=1, verbose=True)
print('\n', docs)


def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data_vectorized):
    """Build a DataFrame of each document's dominant topic, weight and keywords.

    NOTE(review): *texts* is concatenated column-wise onto the result, so it
    must align row-wise with *corpus*; the call below passes the sklearn
    document-term matrix — verify that is intentional.
    """
    # MODERNISED: rows are collected in a list and turned into a DataFrame
    # once — the original called DataFrame.append() inside the loop, which
    # is O(n^2) and was removed in pandas 2.
    rows = []
    # Get the main topic of each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        if row:
            # dominant topic = highest-probability entry
            topic_num, prop_topic = row[0]
            wp = ldamodel.show_topic(topic_num)
            topic_keywords = ", ".join([word for word, prop in wp])
            rows.append(pd.Series([int(topic_num), round(prop_topic, 4), topic_keywords]))
    sent_topics_df = pd.DataFrame(rows)
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return (sent_topics_df)


# NOTE(review): best_lda_model is a *sklearn* estimator, which does not
# support the gensim-style `ldamodel[corpus]` indexing used inside
# format_topics_sentences — this call looks like it was meant to receive the
# gensim `lda_model`; verify before relying on the output.
df_topic_sents_keywords = format_topics_sentences(ldamodel=best_lda_model,
                                                  corpus=corpus, texts=data_vectorized)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
# Show
df_dominant_topic.head(10)

# Group the single most representative document under each topic.
sent_topics_sorteddf_mallet = pd.DataFrame()
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')
for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat(
        [sent_topics_sorteddf_mallet,
         grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)],
        axis=0)
# Reset Index
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)
# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]
# Show
sent_topics_sorteddf_mallet.head()

# Number of documents for each topic
topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()
# Percentage of documents for each topic
topic_contribution = round(topic_counts / topic_counts.sum(), 4)
# Topic number and keywords
topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]
# Concatenate column-wise
df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)
# Change column names
df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']
# Show
df_dominant_topics.head()
len(global_df)

# ---------------------------------------------------------------------------
# Named-entity tables ("conducted on Cortext platform").
# ---------------------------------------------------------------------------
org_name = pd.read_csv('name_entity/org-name.csv')
product_name = pd.read_csv('name_entity/product-name-tsv.csv')
works = pd.read_csv('name_entity/work-of-art-tsv.csv', sep="\t")
person_name = pd.read_csv('name_entity/person.csv', sep="\t")
event_name = pd.read_csv('name_entity/event.csv', sep="\t")
loc1 = pd.read_csv('name_entity/loc1.csv', sep="\t")
loc2 = pd.read_csv('name_entity/location2.csv', sep="\t")

# drop incomplete rows from every entity table
org_name = org_name.dropna()
product_name = product_name.dropna()
works = works.dropna()
person_name = person_name.dropna()
event_name = event_name.dropna()
loc1 = loc1.dropna()
loc2 = loc2.dropna()
print(len(loc1))
print(len(loc2))
location = pd.concat([loc1, loc2], ignore_index=True)

# tag each table with its entity category
org_name['category'] = 'ORG'
product_name['category'] = 'PRODUCT'
works['category'] = 'WORKS'
person_name['category'] = 'PERSON'
event_name['category'] = 'EVENT'
location['category'] = 'LOCATION'

org_name.sort_values(by="frequency", ascending=False)
org_name = org_name.dropna()
org_name.entity.items
import plotly.graph_objs as go
data = dict(org_name)


def str_list(mylist):
    """Join a Series of entity strings into one space-separated string for WordCloud."""
    mylist = mylist.tolist()
    mylist = str(mylist)
    mylist = tokenize_text(mylist)
    mylist = ' '.join(map(str, mylist))
    return mylist


# Word cloud of ORGANISATION entities.
org_list = str_list(org_name.entity)
wordcloud = WordCloud(width=1600, height=800, max_font_size=150, max_words=100,
                      colormap='Set1', background_color="white",
                      collocations=True).generate(org_list)
plt.figure(figsize=(10, 10))
plt.imshow(wordcloud)
# BUGFIX: the titles of the two word clouds were swapped — this cloud is
# generated from org_name, the next one from product_name.
plt.title('Wordcloud of organisation names talked about')
plt.axis("off")
plt.show()

# Word cloud of PRODUCT entities.
product = str_list(product_name.entity)
wordcloud = WordCloud(width=1600, height=800, max_font_size=150, max_words=100,
                      colormap='Set1', collocations=True,
                      background_color="white").generate(product)
plt.figure(figsize=(10, 10))
plt.imshow(wordcloud)
plt.title('Wordcloud of products talked about')
plt.axis("off")
plt.show()

# Bubble chart of organisation-name frequencies.
fig = px.scatter(org_name, x='frequency', y='entity', text='entity',
                 size='frequency', color='frequency', size_max=45,
                 template='plotly_white',
                 title='Bigram similarity and frequency',
                 labels={'words': 'Avg. Length<BR>(words)'},
                 color_continuous_scale=px.colors.sequential.Sunsetdark)
fig.update_traces(marker=dict(line=dict(width=1, color='Gray')))
fig.update_xaxes(visible=False)
fig.update_yaxes(visible=False)
fig.show()